Package installation

Install the following packages and load them:

# Install the required packages.
# NOTE: install.packages() takes a *character vector* of package names as its
# first argument. Passing the names as separate arguments would silently
# interpret the second one ("expm") as the `lib` argument (the install
# directory) and only install "devtools".
install.packages(c("devtools", "expm", "ggplot2", "NPflow", "truncnorm"))
devtools::install_github("borishejblum/CRPdemo")
library(NPflow)
library(CRPdemo)
library(ggplot2)

Synthetic data

n <- 100 # number of observations
set.seed(1231)

# Cluster means (one column per cluster) and mixture weights
m <- matrix(nrow = 2, ncol = 4, c(-1, 1, 0, 2, 1, -2, -1, -2))
p <- c(0.2, 0.1, 0.4, 0.3) # cluster frequencies

library(expm)
# Per-cluster 2x2 covariance matrices
s <- array(dim = c(2, 2, 4))
s[, , 1] <- matrix(nrow = 2, ncol = 2, c(0.1, 0, 0, 0.1))
s[, , 2] <- matrix(nrow = 2, ncol = 2, c(0.01, 0, 0, 0.1))
s[, , 3] <- matrix(nrow = 2, ncol = 2, c(0.1, 0.08, 0.08, 0.1))
s[, , 4] <- 0.1 * diag(2)
# NOTE(review): `c` shadows base::c() here; kept because later chunks
# reference this name, but a name like `cl` would be safer.
c <- rep(0, n) # true cluster allocation of each observation
z <- matrix(0, nrow = 2, ncol = n) # observations, one column per point
for (k in seq_len(n)) {
    # Draw a cluster label, then sample from that cluster's bivariate
    # Gaussian using the matrix square root of its covariance
    c[k] <- which(rmultinom(n = 1, size = 1, prob = p) != 0)
    z[, k] <- m[, c[k]] +
        expm::sqrtm(s[, , c[k]]) %*% matrix(rnorm(2, mean = 0, sd = 1), nrow = 2, ncol = 1)
}

plot(t(z), pch = 16, xlim = c(-3, 3), ylim = c(-3, 3), xlab = "Dim 1", ylab = "Dim 2")

Marginal algorithms

We first need to define the prior distribution and set its hyperparameters.

# Hyperparameters of the base distribution G0
hyperG0 <- list(
    mu = c(0, 0),
    kappa = 1,
    nu = 4,
    lambda = diag(2)
)
# Scale (concentration) parameter of the DPM
alpha <- 2

# Number of Gibbs iterations (kept small for a quick demo; try 15)
N <- 5

Polya Urn representation

# Gibbs sampler using the Polya Urn representation (from the CRPdemo
# package); doPlot = TRUE displays the cluster allocations as it runs
gibbsDPMalgo1(z, hyperG0, alpha, niter = N, doPlot = TRUE) #, pause = 0

Try increasing the number of iterations

Chinese Restaurant Process representation

# Gibbs sampler using the Chinese Restaurant Process representation
# (same data and hyperparameters as the Polya Urn run above)
gibbsDPMalgo2(z, hyperG0, alpha, niter = N, doPlot = TRUE) #, pause = 0

Now let’s try to change alpha and see how it impacts the algorithm

# A very small alpha strongly favors few clusters a priori
gibbsDPMalgo2(z, hyperG0, alpha = 0.0001, niter=N, doPlot=TRUE) #, pause = 0

# A large alpha favors many clusters a priori
gibbsDPMalgo2(z, hyperG0, alpha = 30, niter=N, doPlot=TRUE) #, pause = 0

Another important parameter is the prior on the variance of each cluster in the base distribution \(G_0\). Let us increase this variance and see how it impacts the algorithm:

# Inflate the prior variance in the base distribution
hyperG0[["lambda"]] <- 100*diag(2)
# Call the sampler directly, consistent with the other calls in this file.
# The original `expr = gibbsDPMalgo2(...)` was a stray assignment — likely a
# leftover from wrapping the call in system.time(expr = ...).
gibbsDPMalgo2(z, hyperG0, alpha = 2, niter=N, doPlot=TRUE) #, pause = 0

Partially collapsed Gibbs sampler

Now we are going to use the package NPflow

# Open the help page of NPflow's partially collapsed Gibbs sampler
?DPMGibbsN
# Tighter prior variance on the clusters than before
hyperG0[["lambda"]] <- diag(2)/10
# a, b: parameters of the hyperprior on the DPM scale parameter (see
# ?DPMGibbsN); N: number of MCMC iterations; nbclust_init: number of
# clusters at initialization
MCMCsample <- DPMGibbsN(z, hyperG0, a=0.001, b=0.001, N=500, doPlot = TRUE, 
                        nbclust_init=30, plotevery=100, diagVar=FALSE)

# Convergence diagnostics, discarding the first iteration
plot_ConvDPM(MCMCsample, from=2)
# NOTE(review): this reassigns `s`, clobbering the covariance array defined
# in the synthetic-data chunk above
s <- summary(MCMCsample, burnin = 200, thin=2, posterior_approx=FALSE,
             lossFn = "Binder")

print(s)
## summaryDPMMclust object with 150 observations sampled from the posterior:
## ------------------------------------------------------------------------
## Burnin: 200 MCMC iterations discarded
## 
## Point estimate of the partition with a gaussian mixture:
##     2    12    15    30 
## 0.32%  0.4% 0.07% 0.21%
plot(s, hm=TRUE)

NPflow can also deal with skewed and heavy-tailed data:

library(truncnorm)

# Simulate n observations from a 4-component skew-t mixture in d dimensions
n <- 2000
set.seed(4321)
d <- 2   # dimension
ncl <- 4 # number of clusters

sdev <- array(dim = c(d, d, ncl))
# Location (xi), skew (psi), degrees of freedom (nu) and weights (p),
# one column (resp. entry) per cluster
xi <- matrix(nrow = d, ncol = ncl, c(-0.2, 0.5, 2.4, 0.4, 0.6, -1.3, -0.9, -2.7))
psi <- matrix(nrow = d, ncol = ncl, c(0.3, -0.7, -0.8, 0, 0.3, -0.7, 0.2, 0.9))
nu <- c(100, 25, 8, 5)
p <- c(0.15, 0.05, 0.5, 0.3)
sdev[, , 1] <- matrix(nrow = d, ncol = d, c(0.3, 0, 0, 0.3))
sdev[, , 2] <- matrix(nrow = d, ncol = d, c(0.1, 0, 0, 0.3))
sdev[, , 3] <- matrix(nrow = d, ncol = d, c(0.3, 0.15, 0.15, 0.3))
sdev[, , 4] <- 0.3 * diag(d)


c <- rep(0, n)          # true cluster allocations
w <- rep(1, n)          # latent Gamma scaling (gives the t tails)
z <- matrix(0, nrow = d, ncol = n)
for (k in seq_len(n)) {
    # Draw a cluster, a latent scale w, then a skew-t draw: location +
    # skew * half-normal + scaled Gaussian noise
    c[k] <- which(rmultinom(n = 1, size = 1, prob = p) != 0)
    w[k] <- rgamma(1, shape = nu[c[k]]/2, rate = nu[c[k]]/2)
    z[, k] <- xi[, c[k]] + psi[, c[k]]*rtruncnorm(n=1, a=0, b=Inf, mean=0, sd=1/sqrt(w[k])) +
        (sdev[, , c[k]]/sqrt(w[k])) %*% matrix(rnorm(d, mean = 0, sd = 1), nrow = d, ncol = 1)
}

# Scatter plot of the simulated skew-t data
ggplot(data.frame("X" = z[1, ], "Y" = z[2, ]), aes(x = X, y = Y)) +
    geom_point() +
    labs(x = "D1", y = "D2") +
    theme_bw()

# Relabel the true cluster ids for display.
# NOTE(review): this maps factor levels 1,2,3,4 to labels "4","1","3","2" —
# presumably to match the label ordering produced by the sampler; verify.
c2plot <- factor(c)
levels(c2plot) <- c("4", "1", "3", "2")
# Same scatter plot, colored by (relabeled) true cluster
ggplot(data.frame("X"=z[1,], "Y"=z[2,], "Cluster"=as.character(c2plot))) +
    geom_point(aes(x=X, y=Y, colour=Cluster, fill=Cluster)) +
    xlab("D1") +
    ylab("D2") +
    theme_bw() +
    scale_colour_discrete(guide=guide_legend(override.aes = list(size = 6, shape=22)))

# Hyperparameters of the base distribution G0 for the skew-t mixture
hyperG0 <- list(
    b_xi = rowMeans(z),  # prior location centered at the data mean
    b_psi = rep(0, d),   # prior skew centered at zero
    kappa = 0.001,
    D_xi = 100,
    D_psi = 100,
    nu = d + 1,
    lambda = diag(apply(z, MARGIN = 1, FUN = var)) / 3
)

# Hyperprior on the scale parameter of the DPM
a <- 0.0001
b <- 0.0001

# Gibbs sampler for Dirichlet Process Mixtures of skew-t distributions
MCMCsample_st <- DPMGibbsSkewT(
    z, hyperG0, a, b,
    N = 1500,
    doPlot = TRUE,
    nbclust_init = 30,
    plotevery = 100,
    diagVar = FALSE
)

# Point estimate of the partition after discarding burn-in, thinning every
# 10th draw, using Binder's loss to pick the representative clustering
s <- summary(MCMCsample_st, burnin = 1000, thin=10, lossFn = "Binder")
print(s)
## summaryDPMMclust object with 50 observations sampled from the posterior:
## ------------------------------------------------------------------------
## Burnin: 1000 MCMC iterations discarded
## 
## Point estimate of the partition with a skewt mixture:
##      2      6     16     25 
##  0.49% 0.057%  0.14%  0.32%
# Heatmap of the posterior co-clustering, then convergence diagnostics
plot(s, hm=TRUE)
plot_ConvDPM(MCMCsample_st, from=2)